The current new hotness for retrieving and manipulating Twitter data in the R world is rtweet, which is what we’ll use today.
The basic outline of code and analysis will be a mishmash of Mike Kearney’s Ruser2018 analysis and my old NIPS2016 analysis. Kearney’s analysis is quite good, I recommend it for anyone that is interested in methodology.
The network vis also samples from a notebook from a cool biologist Kenneth Turner, check out his notebook
He (Kearney)’s the author of rtweet, so often there’s a bit more of the details sprinkled throughout the analysis.
Before beginning, I’ll sprinkle in some of the highly retweeted tweets from the first couple days.
First, a good tweet:
Follow our #ESTRO37 Ambassadors, the best way to be informed of what’s going on in any track, whether you are in Barcelona or not! pic.twitter.com/r73mXWHPi5
— ESTRO (@ESTRO_RT) April 18, 2018
Then, a curious tweet (well, I think conferences that still try to lock down taking pictures are fighting against a river at this point):
Follow our #ESTRO37 Ambassadors, the best way to be informed of what’s going on in any track, whether you are in Barcelona or not! pic.twitter.com/r73mXWHPi5
— ESTRO (@ESTRO_RT) April 18, 2018
library(rtweet)
## Hashtag(s) to search for; multiple terms get OR'ed together below.
searchfield <- c("ESTRO37")
## If a previous search was saved, resume from its newest status ID so we
## only pull tweets we haven't already collected.
saved_file <- file.path("data", "search.rds")
if (file.exists(saved_file)) {
  since_id <- readRDS(saved_file)$status_id[1]
} else {
  since_id <- NULL
}
## search for up to 100,000 tweets mentioning the conference hashtag
rt <- search_tweets(
  paste(searchfield, collapse = " OR "),
  n = 1e5, verbose = FALSE,
  since_id = since_id,
  retryonratelimit = TRUE
)
## If a search file was saved previously, read it in, merge it with the
## fresh results, drop duplicate statuses, and keep users data in sync.
prev_file <- file.path("data", "search.rds")
if (file.exists(prev_file)) {
  ## row-bind tweets (and their users attribute) from both searches
  rt <- do_call_rbind(list(rt, readRDS(prev_file)))
  ## flag the first occurrence of each status ID
  keep <- !duplicated(rt$status_id)
  ## users rows are parallel to tweet rows, so subset them the same way
  users <- users_data(rt)[keep, ]
  rt <- rt[keep, ]
  ## re-attach the deduplicated users data
  attr(rt, "users") <- users
}
## save the full data for next time
saveRDS(rt, file.path("data", "search.rds"))
## save shareable data (only status_ids)
saveRDS(rt[, "status_id"], file.path("data", "search-ids.rds"))
Time series of the data in two hour chunks. One REALLY nice thing about rtweet is that it makes plotting the time series of tweets a completely lazy-person’s function, ts_plot, where you can feed it the aggregation time to summarize over. Here, we go with 2 hours as that seems like a good medium to begin with.
suppressPackageStartupMessages(library(tidyverse))
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
## Two-hour tweet-frequency time series with a loess smoother on top
rt %>%
  filter(created_at > "2018-01-29") %>%
  ts_plot("2 hours", color = "transparent") +
  geom_smooth(method = "loess", se = FALSE, span = .1,
              size = 2, colour = "#0066aa") +
  geom_point(size = 5,
             shape = 21, fill = "#ADFF2F99", colour = "#000000dd") +
  theme(axis.text = element_text(colour = "#222222"),
        ## name the argument: the first positional arg of element_text()
        ## happens to be `family`, but relying on position is fragile
        text = element_text(family = "Roboto Condensed"),
        plot.title = element_text(size = rel(1.7), face = "bold"),
        plot.subtitle = element_text(size = rel(1.3)),
        plot.caption = element_text(colour = "#444444")) +
  labs(title = "Frequency of tweets about ESTRO37 over time",
       subtitle = "Twitter status counts aggregated using two-hour intervals",
       caption = "\n\nSource: Data gathered via Twitter's standard `search/tweets` API using rtweet",
       x = NULL, y = NULL)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 1.5242e+09
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 15552
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.4186e+08
Again, another analysis I’ve done in the past, but here I’ll use Mike Kearney’s version simply to make my life a bit easier as it’s mapping on to rtweet’s data, and I haven’t actually used the syuzhet, which is a commonly used sentiment analysis package.
## Clean up the text a bit: strip leading "RT", hashtag marks,
## @mentions, and links
cleaned <- gsub("^RT:?\\s{0,}|#|@\\S+|https?[[:graph:]]+", "", rt$text)
## lower-case everything
cleaned <- tolower(cleaned)
## trim leading/trailing whitespace, then squeeze internal runs of spaces
cleaned <- gsub("^\\s{1,}|\\s{1,}$", "", cleaned)
rt$text2 <- gsub("\\s{2,}", " ", cleaned)
## estimate pos/neg sentiment for each cleaned tweet
rt$sentiment <- syuzhet::get_sentiment(rt$text2, "syuzhet")
## write function to round time into rounded var
## Floor a POSIXct time down to the start of its `sec`-second bin.
##
## The previous implementation round-tripped through hms::hms(), which
## reinterprets seconds-since-epoch as a time-of-day and needs a non-base
## package; converting the floored epoch seconds directly with
## as.POSIXct() is self-contained and unambiguous.
##
## @param x   a POSIXct vector
## @param sec bin width in seconds (e.g. 60 * 60 for one-hour bins)
## @return a POSIXct vector floored to the start of each bin (UTC)
round_time <- function(x, sec) {
  as.POSIXct(as.numeric(x) %/% sec * sec,
             origin = "1970-01-01", tz = "UTC")
}
## plot by specified time interval (1-hours)
## Hourly mean sentiment, coloured by whether the hour skews positive
rt %>%
  mutate(time = round_time(created_at, 3600)) %>%
  group_by(time) %>%
  summarise(sentiment = mean(sentiment, na.rm = TRUE)) %>%
  mutate(valence = ifelse(sentiment > 0L, "Positive", "Negative")) %>%
  ggplot(aes(x = time, y = sentiment)) +
  geom_smooth(method = "loess", span = .1,
              colour = "#aa11aadd", fill = "#bbbbbb11") +
  geom_point(aes(fill = valence, colour = valence),
             shape = 21, alpha = .6, size = 3.5) +
  theme(legend.position = "none",
        text = element_text(family = "Roboto Condensed"),
        axis.text = element_text(colour = "#222222"),
        plot.title = element_text(size = rel(1.7), face = "bold"),
        plot.subtitle = element_text(size = rel(1.3)),
        plot.caption = element_text(colour = "#444444")) +
  scale_fill_manual(values = c(Positive = "#2244ee", Negative = "#dd2222")) +
  scale_colour_manual(values = c(Positive = "#001155", Negative = "#550000")) +
  labs(x = NULL, y = NULL,
       title = "Sentiment (valence) of ESTRO37 tweets over time",
       subtitle = "Mean sentiment of tweets aggregated in one-hour intervals",
       caption = "\nSource: Data gathered using rtweet. Sentiment analysis done using syuzhet")
So… Who are the top ranking tweeps currently?
## Tally favourites, retweets, and tweet counts per account,
## ordered by number of statuses (retweets included)
showvals <- rt %>%
  select(favorite_count, retweet_count, screen_name, name) %>%
  group_by(screen_name, name) %>%
  summarise(fav_count = sum(favorite_count),
            rt_count = sum(retweet_count),
            n = n()) %>%
  arrange(desc(n))
knitr::kable(showvals[1:40, ])
| screen_name | name | fav_count | rt_count | n |
|---|---|---|---|---|
| FuenteApolo | Castalia | 205 | 1149 | 270 |
| roentgen66 | Virginia Ruiz | 0 | 989 | 224 |
| JoaquinJCabrera | Joaquín J Cabrera | 135 | 514 | 122 |
| davidbermudezi | Ramiro Bermudez-I.MD | 0 | 696 | 121 |
| cd_fuller | dave fuller | 3 | 463 | 93 |
| BreastDocUK | Richard Simcock | 390 | 493 | 81 |
| anitaodonovan1 | Anita O’Donovan | 260 | 390 | 73 |
| LuisAlberto3P | Dr. Luis A. Pérez-Romasanta | 90 | 431 | 71 |
| AmadeoWals | Amadeo Wals | 35 | 251 | 63 |
| Monthy_A | Angel Montero Luis | 34 | 371 | 59 |
| MsConcu | Tere M. Migueláñez | 76 | 187 | 53 |
| subatomicdoc | Matthew Katz, MD | 62 | 314 | 51 |
| p_blancha | Pierre Blanchard, MD | 215 | 340 | 50 |
| CancerGeek | CancerGeek | 139 | 216 | 47 |
| Dr_ASalem | Ahmed Salem | 28 | 167 | 44 |
| RT_physics | RTphysics Manchester | 221 | 173 | 43 |
| Rad_Nation | Radiation Nation | 106 | 284 | 37 |
| FREELANCEROG | FROG | 37 | 173 | 36 |
| Accuray | Accuray Incorporated | 51 | 106 | 33 |
| adelapoite | Adela | 0 | 187 | 33 |
| gerryhanna | Gerry Hanna | 187 | 166 | 30 |
| JulieMcCrossin | Julie McCrossin | 60 | 243 | 30 |
| simongoldswort1 | Simon Goldsworthy | 74 | 141 | 30 |
| LionelREICHARDT | PHARMAGEEK | 68 | 125 | 29 |
| cancerphysicist | Ane Appelt | 179 | 137 | 28 |
| achoud72 | Dr Ananya | 33 | 116 | 26 |
| syeepei | YeePei Song | 67 | 45 | 26 |
| christian_roenn | Chr Rønn Hansen | 142 | 56 | 25 |
| ESTRO_RT | ESTRO | 296 | 121 | 25 |
| ClinOncologist | David Woolf | 116 | 106 | 23 |
| MPHRadiotherapy | MPHRadiotherapy | 0 | 122 | 23 |
| MichRadioactiva | MiVQ | 0 | 161 | 22 |
| TovarAris | María Isabel Tovar | 0 | 173 | 22 |
| WarrenBacorro | Warren Bacorro | 24 | 121 | 21 |
| antheasaif | Anthea Cree | 20 | 67 | 19 |
| lucindamorris23 | Lucinda Morris | 110 | 165 | 18 |
| Elekta | Elekta | 80 | 37 | 17 |
| finn_corinne | Corinne Faivre-Finn | 98 | 71 | 17 |
| FYOncologist | FYO | 55 | 68 | 17 |
| sandraturner49 | Sandra Turner | 54 | 182 | 17 |
# Includes both tweets and retweets
showvals[1:40, ] %>%
  transform(screen_name = reorder(screen_name, n)) %>%
  ## geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
  ggplot(aes(screen_name, n)) + geom_col() +
  coord_flip() +
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = NULL, y = NULL,
       title = "Top 40 tweeters of ESTRO37")
## Same ranking restricted to original statuses (retweets excluded).
## The previous title said "retweeters", which contradicted the
## is_retweet == FALSE filter — these are original-content authors.
showvals <- rt %>%
  filter(is_retweet == FALSE) %>%
  select(favorite_count, retweet_count, screen_name, name) %>%
  group_by(screen_name, name) %>%
  summarise(fav_count = sum(favorite_count),
            rt_count = sum(retweet_count),
            n = n()) %>%
  arrange(desc(n))
showvals[1:40, ] %>%
  transform(screen_name = reorder(screen_name, n)) %>%
  ggplot(aes(screen_name, n)) + geom_col() +
  coord_flip() +
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = NULL, y = NULL,
       title = "Top 40 original-tweet authors of ESTRO37")
Here is that somewhat (very) dumb influence metric I cooked up or adapted from elsewhere, I can’t quite remember at this point. Either way I don’t put much value in it. It’s basically just the sum of favorites and retweets
library(viridis)
## Loading required package: viridisLite
## "Impact" = favourites + retweets, summed per account
showvals2 <- showvals %>%
  mutate(impact = fav_count + rt_count) %>%
  arrange(desc(impact))
showvals2[1:40, ] %>%
  transform(screen_name = reorder(screen_name, impact)) %>%
  ggplot(aes(screen_name, impact, fill = impact / n)) +
  geom_bar(stat = "identity") +
  coord_flip() + ylab('Impact (numFavorites + numRetweets)') +
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_viridis(trans = "log", breaks = c(1, 5, 10, 50))
# Still using the hated word cloud
The word cloud gets a bad rap, I mean, it IS basically impossible to interpret in any numerical or comparative sense. But I do still find it useful to get a quick overview of just what types of things people are talking about. And thus, wordclouds we go.
library(tidytext)
library(RColorBrewer)
## one row per word per tweet (tokenized from the cleaned text)
tidy_df <- rt %>% unnest_tokens(word, text2)
## Custom stop words: the conference tag plus a few noise tokens.
## BUG FIX: `lexicon = 'whatevs'` was accidentally *inside* c(), which made
## it an extra element of `word` rather than a second column; the default
## factor conversion also triggered a coercion warning on the join.
tw_stop <- data.frame(
  word = c("ESTRO37", "estro37", "rtt", "n", "24", "30", "1300"),
  lexicon = "whatevs",
  stringsAsFactors = FALSE
)
## very high-frequency English and Spanish words from rtweet's word list
stop_words <- filter(stopwordslangs, (lang == 'en' | lang == "es") & p > .9999) %>% pull(word)
stop_words <- tibble(word = stop_words)
## drop both stop-word sets from the token list
tidy_cloud <- tidy_df %>%
  anti_join(tw_stop) %>%
  anti_join(stop_words)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Joining, by = "word"
## word frequencies feeding the cloud
word_freqs <- tidy_cloud %>% count(word)
library(wordcloud)
wordcloud(word_freqs$word, word_freqs$n, max.words = 150,
          colors = brewer.pal(8, 'Dark2'),
          random.order = FALSE, random.color = FALSE)
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : leadership could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : mundial could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : hypoxia could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : impact could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : satisfaction could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : bcsm could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : improving could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : lecture could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : present could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : science could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : inmunoonc could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : issues could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : practice could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : proud could not be fit on page. It will not be plotted.
OK, after all of that, what you actually really cared about was the network, right?
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are *required* to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see http://bit.ly/arialnarrow
## Directed mention graph: tweeter -> mentioned account,
## edge weight = number of mentions
mention_edges <- rt %>%
  select(screen_name, mentions_screen_name) %>%
  unnest(mentions_screen_name) %>%
  filter(!is.na(mentions_screen_name))
m_g <- mention_edges %>%
  group_by(screen_name, mentions_screen_name) %>%
  summarise(weight = n()) %>%
  graph_from_data_frame()
library(igraph)
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:stats':
##
## filter
library(ggraph)
m_graph <- m_g
## Node table: vertex index, screen name, and total degree
dfv <- data.frame(V = as.vector(V(m_graph)),
                  screen_name = V(m_graph)$name,
                  degree(m_graph))
names(dfv)[3] <- "degree"
## Split nodes at the 95th degree percentile.
## FIX: labels previously said "Bottom99"/"Top1", but the cut points are
## the 0/95/100 percentiles, so these are really the bottom 95% / top 5%.
dfv <- cbind(dfv,
             quantile = cut(dfv$degree,
                            breaks = quantile(dfv$degree, probs = c(0, .95, 1)),
                            labels = c("Bottom95", "Top5"),
                            include.lowest = TRUE))
dfv$quantile <- as.character(dfv$quantile)
library(dplyr)
dfv2 <- arrange(dfv, desc(quantile))
## keep only the high-degree (top 5%) nodes for the subgraph
dfv3 <- dfv2[dfv2$quantile == "Top5", ]
print(nrow(dfv3))
## [1] 52
## Subgraph induced by the high-degree nodes only
red_gr <- induced_subgraph(m_graph, dfv3$V)
## (alternative kk-layout / pagerank version kept for reference)
#g=as_tbl_graph(red_gr) %>%
#  mutate(pop=centrality_pagerank())
#ggraph(g,layout='kk')+
#  geom_edge_fan(aes(alpha=..index..),show.legend=FALSE) +
#  geom_node_point(aes(size=pop),show.legend=FALSE) +geom_node_label(aes(label=name))+theme_graph()
## Label/size only well-connected nodes (degree > 20); everything else
## gets an empty label and zero size so it stays unobtrusive.
V(red_gr)$node_label <- unname(ifelse(degree(red_gr)[V(red_gr)] > 20, names(V(red_gr)), ""))
V(red_gr)$node_size <- unname(ifelse(degree(red_gr)[V(red_gr)] > 20, degree(red_gr), 0))
## Circular arc layout; fixed the subtitle ("Darkers" typo, and this is a
## mention graph, so darker edges mean more mentions — not retweets).
ggraph(red_gr, layout = 'linear', circular = TRUE) +
  geom_edge_arc(edge_width = 0.125, aes(alpha = ..index..)) +
  geom_node_label(aes(label = node_label, size = node_size),
                  label.size = 0, fill = "#ffffff66", segment.colour = "springgreen",
                  color = "slateblue", repel = TRUE, family = font_rc, fontface = "bold") +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(title = "Mention Relationships",
       subtitle = "Most mentioned screen names labeled. Darker edges == more mentions. Node size == larger degree") +
  theme_graph(base_family = font_rc) +
  theme(legend.position = "none")
# retweet analysis
## Directed retweet graph: retweeter -> original tweeter
rt_g <- filter(rt, retweet_count > 0) %>%
  select(screen_name, retweet_screen_name) %>%
  filter(!is.na(retweet_screen_name)) %>%
  graph_from_data_frame()
dfv <- data.frame(V = as.vector(V(rt_g)),
                  screen_name = V(rt_g)$name,
                  degree(rt_g))
names(dfv)[3] <- "degree"
## Split nodes at the 90th degree percentile.
## FIX: labels previously said "Bottom99"/"Top1", but the cut points are
## the 0/90/100 percentiles, so these are really the bottom 90% / top 10%.
dfv <- cbind(dfv,
             quantile = cut(dfv$degree,
                            breaks = quantile(dfv$degree, probs = c(0, .9, 1)),
                            labels = c("Bottom90", "Top10"),
                            include.lowest = TRUE))
dfv$quantile <- as.character(dfv$quantile)
library(dplyr)
dfv2 <- arrange(dfv, desc(quantile))
## keep only the high-degree (top 10%) nodes
dfv3 <- dfv2[dfv2$quantile == "Top10", ]
print(nrow(dfv3))
## [1] 88
## Statuses authored by the high-degree accounts
ndf <- filter(rt, screen_name %in% dfv3$screen_name)
## ...how many of those are retweets (retweet_screen_name present)
nrow(filter(ndf, !is.na(retweet_screen_name)))
## [1] 1600
## ...and how many are original (non-retweet) statuses
nrow(filter(ndf, is.na(retweet_screen_name)))
## [1] 874
## Subgraph induced by the high-degree retweet nodes only
red_gr_rt <- induced_subgraph(rt_g, dfv3$V)
## Label/size only nodes with degree > 25; others get empty label / size 0
V(red_gr_rt)$node_label <- unname(ifelse(degree(red_gr_rt)[V(red_gr_rt)] > 25, names(V(red_gr_rt)), ""))
V(red_gr_rt)$node_size <- unname(ifelse(degree(red_gr_rt)[V(red_gr_rt)] > 25, degree(red_gr_rt), 0))
## Circular arc layout; subtitle typo "Darkers" fixed.
ggraph(red_gr_rt, layout = 'linear', circular = TRUE) +
  geom_edge_arc(edge_width = 0.125, aes(alpha = ..index..)) +
  geom_node_label(aes(label = node_label, size = node_size),
                  label.size = 0, fill = "#ffffff66", segment.colour = "springgreen",
                  color = "slateblue", repel = TRUE, family = font_rc, fontface = "bold") +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(title = "Retweet Relationships",
       subtitle = "Most retweeted screen names labeled. Darker edges == more retweets. Node size == larger degree") +
  theme_graph(base_family = font_rc) +
  theme(legend.position = "none")